In this piece of paper, a set of data obtained from Annual Status of Education Report (ASER) is explored. The raw data was downloaded from the link here. https://palnetwork.org/aser-centre/
library(tidyverse)
library(ggplot2)
library(ggthemes)
library(ggrepel)
library(gghighlight)
library(stringr)
library(dplyr)
library(sf)
library(scatterplot3d)
library(car)
library(ResourceSelection) #to excute Hosmer-Lemeshow test
## Warning: package 'ResourceSelection' was built under R version 4.0.3
# library(broom)
require(ggiraph)
## Warning: package 'ggiraph' was built under R version 4.0.3
require(ggiraphExtra)
# require(plyr)
provdist <- read.csv("aser/ASER2016ProvDist.csv")
school <- read.csv("aser/ASER2016GSchool.csv")
child <- read.csv("aser/ASER2016Child.csv")
pschool <- read.csv("aser/ASER2016PvtSchool.csv")
gschool <- read.csv("aser/ASER2016GSchool.csv")
parent <- read.csv("aser/ASER2016Parent.csv")
house <- read.csv("aser/ASER2016HouseholdIndicators.csv")
RegionName <- c("2" = "Panjab",
"3" = "Sindh",
"4" = "Balochistan",
"5" = "Khyber Pakhtunkhwa",
"6" = "Gilgit-Baltistan",
"7" = "Azad Jammu and Kashmir",
"8" = "Islamabad - ICT",
"9" = "Federally Administrated Tribal Areas")
Gender <- c("0" = "Male",
"-1" = "Female")
ica <- sf::st_read("map/pak_ica_categories_areas_geonode_apr2017.shp")
## Reading layer `pak_ica_categories_areas_geonode_apr2017' from data source `C:\Program Files\R\capstone\map\pak_ica_categories_areas_geonode_apr2017.shp' using driver `ESRI Shapefile'
## Simple feature collection with 156 features and 8 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 60.8786 ymin: 23.69468 xmax: 77.83397 ymax: 37.08942
## geographic CRS: WGS 84
ica_df <- ica %>%
mutate(centroid = st_centroid(geometry),
x = st_coordinates(centroid)[,1],
y = st_coordinates(centroid)[,2]) %>%
as.data.frame()
## Warning: Problem with `mutate()` input `centroid`.
## x st_centroid does not give correct centroids for longitude/latitude data
## i Input `centroid` is `st_centroid(geometry)`.
## Warning in st_centroid.sfc(geometry): st_centroid does not give correct
## centroids for longitude/latitude data
ica_df <- ica_df %>% select(Province, Districts, x, y)
ica_df <- ica_df %>% summarize(Province = tolower(Province), Districts = tolower(Districts), x = x, y = y)
child_dname <- child %>% left_join(provdist[-1])
## Joining, by = "DID"
child_dname <- child_dname %>% mutate(dname = tolower(DNAME))
ica_df_3 <- ica_df %>% filter(Province == "sindh")
ica_df_3$Districts <- ica_df_3$Districts %>%
str_replace("ghotki", "gotki") %>%
str_replace("mirpur khas", "mirpurkhas") %>%
str_replace("malir karachi", "karachi-malir-rural") %>%
str_replace("naushahro feroze", "nowshero feroze") %>%
str_replace("kambar shahdad kot", "qambar shahdadkot") %>%
str_replace("sujawal", "sajawal") %>%
str_replace("shaheed benazir abad", "shaheed benazirabad") %>%
str_replace("tando allahyar", "tando allah yar") %>%
as.vector()
child_dname_3 <- child_dname %>% filter(RNAME == "Sindh") %>% left_join(ica_df_3, by = c("dname" = "Districts"))
child_dname_3 %>% group_by(dname) %>% summarize(n = sum(x))
## `summarise()` ungrouping output (override with `.groups` argument)
ica_df_3
child_ica_dummy <- child_ica %>% filter(!is.na(C002), !is.na(C003), !is.na(PR004), !is.na(PR009))
child_ica_dummy$C002_01 <- ifelse(child_ica_dummy$C002 == -1, 1, 0)
child_ica_dummy$C003_01 <- ifelse(child_ica_dummy$C003 == 3, 1, 0) # currently-enrolled
child_ica_dummy$C003_1_01 <- ifelse(child_ica_dummy$C003 == 1, 1, 0) # never-enrolled
child_ica_dummy$C003_2_01 <- ifelse(child_ica_dummy$C003 == 2, 1, 0) # drop-out
child_ica_dummy$PR004_01 <- ifelse(child_ica_dummy$PR004 == -1, 1, 0)
child_ica_dummy$PR009_01 <- ifelse(child_ica_dummy$PR009 == -1, 1, 0)
child_ica_dummy$PR004_PR009_01 <- ifelse(
child_ica_dummy$PR004 == -1 | child_ica_dummy$PR009 == -1, 1, 0)
child_ica_dummy$PR004_only_01 <- ifelse(
child_ica_dummy$PR004 == -1 & child_ica_dummy$PR009 == 0, 1, 0)
child_ica_dummy$PR009_only_01 <- ifelse(
child_ica_dummy$PR004 == 0 & child_ica_dummy$PR009 == -1, 1, 0)
child_ica_dummy$PR004_PR009_both_01 <- ifelse(
child_ica_dummy$PR004 == -1 & child_ica_dummy$PR009 == -1, 1, 0)
n_children_in_household <- child_ica_dummy %>%
group_by(HHID) %>%
summarize(n_children_in_household = length(unique(CID)))
## `summarise()` ungrouping output (override with `.groups` argument)
child_ica_dummy <- child_ica_dummy %>% left_join(n_children_in_household)
## Joining, by = "HHID"
child_ica_dummy$H002_1_01 <- ifelse(child_ica_dummy$H002 == 1, 1, 0)
child_ica_dummy$H002_2_01 <- ifelse(child_ica_dummy$H002 == 2, 1, 0)
child_ica_dummy$H002_3_01 <- ifelse(child_ica_dummy$H002 == 3, 1, 0)
child_ica_dummy <- child_ica_dummy
child_ica_dummy$Panjab <- ifelse(child_ica_dummy$RID == 2, 1, 0)
child_ica_dummy$Sindh <- ifelse(child_ica_dummy$RID == 3, 1, 0)
child_ica_dummy$Balochistan <- ifelse(child_ica_dummy$RID == 4, 1, 0)
child_ica_dummy$Khyber_Pakhtunkhwa <- ifelse(child_ica_dummy$RID == 5, 1, 0)
child_ica_dummy$Gilgit_Baltistan <- ifelse(child_ica_dummy$RID == 6, 1, 0)
child_ica_dummy$Azad_Jammu_and_Kashmir <- ifelse(child_ica_dummy$RID == 7, 1, 0)
child_ica_dummy$Islamabad_ICT <- ifelse(child_ica_dummy$RID == 8, 1, 0)
child_ica_dummy$Federally_Administrated_Tribal_Areas <- ifelse(child_ica_dummy$RID == 9, 1, 0)
dists_near_hunza_1 <- c(178, 199, 243, 259, 265, 266, 267, 270, 273, 274)
child_ica_dummy$DID_near_hunza_1 <- ifelse(child_ica_dummy$DID %in% dists_near_hunza_1, 1, 0)
dists_near_hunza_2 <- c(176, 178, 185, 199, 243, 245, 259, 265, 267, 270, 273, 274)
child_ica_dummy$DID_near_hunza_2 <- ifelse(child_ica_dummy$DID %in% dists_near_hunza_2, 1, 0)
child_ica_dummy$DID <- child_ica_dummy$DID %>% as.factor()
child_ica %>%
summarize(C002_na = sum(is.na(C002)),
C003_na = sum(is.na(C003)),
PR004_na = sum(is.na(PR004)),
PR009_na = sum(is.na(PR009)))
data.frame(original_rows = nrow(child_ica),
eliminated_rows = nrow(child_ica) - nrow(child_ica_dummy),
ratio = (nrow(child_ica)-nrow(child_ica_dummy))/nrow(child_ica))
child_ica %>%
filter(DID == 266) %>%
summarize(C002_na = sum(is.na(C002)),
C003_na = sum(is.na(C003)),
PR004_na = sum(is.na(PR004)),
PR009_na = sum(is.na(PR009)))
data.frame(original_rows = nrow(child_ica %>% filter(DID == 266)),
eliminated_rows = nrow(child_ica %>% filter(DID == 266)) - nrow(child_ica_dummy %>% filter(DID == 266)),
ratio = (nrow(child_ica %>% filter(DID == 266) %>% filter(DID == 266))-nrow(child_ica_dummy %>% filter(DID == 266)))/nrow(child_ica %>% filter(DID == 266)))
glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5, DID == 266))
ggPredict(glm_child, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
##
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_both_01,
## family = "binomial", data = child_ica_dummy %>% filter(C001 >=
## 5, DID == 266))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2554 0.1395 0.1646 0.2860 0.5418
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.1748 0.5255 7.944 1.96e-15 ***
## n_children_in_household -0.3329 0.1099 -3.028 0.002461 **
## PR004_PR009_both_01 1.4519 0.3926 3.698 0.000217 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 384.83 on 1409 degrees of freedom
## Residual deviance: 349.37 on 1407 degrees of freedom
## AIC: 355.37
##
## Number of Fisher Scoring iterations: 7
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
hoslem.test(x = glm_child$y, y = fitted(glm_child))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: glm_child$y, fitted(glm_child)
## X-squared = 5.41, df = 8, p-value = 0.713
extractAIC(glm_child)
## [1] 3.0000 355.3652
extractAIC(glm_child, k = log(nrow(glm_child$data)))
## [1] 3.0000 371.1192
glm_child_null <- glm(C003_01~1, family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5, DID == 266))
anova(glm_child_null, glm_child, test = "Chisq")
step(glm_child_null, direction = "both",
scope = (~ C001 + C002_01 + PR004_PR009_both_01 + n_children_in_household))
## Start: AIC=386.83
## C003_01 ~ 1
##
## Df Deviance AIC
## + PR004_PR009_both_01 1 358.59 362.59
## + n_children_in_household 1 365.61 369.61
## <none> 384.83 386.83
## + C001 1 384.22 388.22
## + C002_01 1 384.61 388.61
##
## Step: AIC=362.59
## C003_01 ~ PR004_PR009_both_01
##
## Df Deviance AIC
## + n_children_in_household 1 349.37 355.37
## <none> 358.59 362.59
## + C002_01 1 358.16 364.16
## + C001 1 358.55 364.55
## - PR004_PR009_both_01 1 384.83 386.83
##
## Step: AIC=355.37
## C003_01 ~ PR004_PR009_both_01 + n_children_in_household
##
## Df Deviance AIC
## <none> 349.37 355.37
## + C002_01 1 348.46 356.46
## + C001 1 349.34 357.34
## - n_children_in_household 1 358.59 362.59
## - PR004_PR009_both_01 1 365.61 369.61
##
## Call: glm(formula = C003_01 ~ PR004_PR009_both_01 + n_children_in_household,
## family = "binomial", data = child_ica_dummy %>% filter(C001 >=
## 5, DID == 266))
##
## Coefficients:
## (Intercept) PR004_PR009_both_01 n_children_in_household
## 4.1748 1.4519 -0.3329
##
## Degrees of Freedom: 1409 Total (i.e. Null); 1407 Residual
## Null Deviance: 384.8
## Residual Deviance: 349.4 AIC: 355.4
vif(glm_child)
## n_children_in_household PR004_PR009_both_01
## 1.068705 1.068705
glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01 + relevel(DID, ref = "266"), family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5))
glm_child %>% summary()
##
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_both_01 +
## relevel(DID, ref = "266"), family = "binomial", data = child_ica_dummy %>%
## filter(C001 >= 5))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2002 0.1836 0.4668 0.6971 1.5731
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.509591 0.156181 22.471 < 2e-16 ***
## n_children_in_household -0.107577 0.003969 -27.104 < 2e-16 ***
## PR004_PR009_both_01 0.673143 0.017386 38.717 < 2e-16 ***
## relevel(DID, ref = "266")146 -0.875895 0.194633 -4.500 6.79e-06 ***
## relevel(DID, ref = "266")147 -1.482768 0.172762 -8.583 < 2e-16 ***
## relevel(DID, ref = "266")148 -0.915625 0.179277 -5.107 3.27e-07 ***
## relevel(DID, ref = "266")149 -1.643550 0.168400 -9.760 < 2e-16 ***
## relevel(DID, ref = "266")150 -1.607819 0.168098 -9.565 < 2e-16 ***
## relevel(DID, ref = "266")151 -0.807095 0.185542 -4.350 1.36e-05 ***
## relevel(DID, ref = "266")152 -0.717054 0.196612 -3.647 0.000265 ***
## relevel(DID, ref = "266")153 -1.736759 0.170212 -10.203 < 2e-16 ***
## relevel(DID, ref = "266")154 -1.553572 0.172629 -8.999 < 2e-16 ***
## relevel(DID, ref = "266")155 -1.670105 0.167193 -9.989 < 2e-16 ***
## relevel(DID, ref = "266")156 -2.140333 0.181540 -11.790 < 2e-16 ***
## relevel(DID, ref = "266")157 -1.103420 0.181688 -6.073 1.25e-09 ***
## relevel(DID, ref = "266")158 -1.156450 0.177290 -6.523 6.89e-11 ***
## relevel(DID, ref = "266")159 -2.184450 0.172128 -12.691 < 2e-16 ***
## relevel(DID, ref = "266")160 -1.709068 0.169995 -10.054 < 2e-16 ***
## relevel(DID, ref = "266")161 -2.304469 0.163081 -14.131 < 2e-16 ***
## relevel(DID, ref = "266")162 -0.630815 0.203630 -3.098 0.001949 **
## relevel(DID, ref = "266")163 -1.012210 0.196169 -5.160 2.47e-07 ***
## relevel(DID, ref = "266")164 -1.659793 0.180095 -9.216 < 2e-16 ***
## relevel(DID, ref = "266")165 -1.903771 0.169885 -11.206 < 2e-16 ***
## relevel(DID, ref = "266")166 -1.684691 0.171039 -9.850 < 2e-16 ***
## relevel(DID, ref = "266")167 -0.396569 0.202148 -1.962 0.049789 *
## relevel(DID, ref = "266")169 -2.712608 0.163731 -16.568 < 2e-16 ***
## relevel(DID, ref = "266")170 -1.222926 0.177931 -6.873 6.28e-12 ***
## relevel(DID, ref = "266")171 -1.205940 0.180436 -6.683 2.33e-11 ***
## relevel(DID, ref = "266")172 -1.385540 0.175339 -7.902 2.74e-15 ***
## relevel(DID, ref = "266")173 -1.117779 0.177247 -6.306 2.86e-10 ***
## relevel(DID, ref = "266")174 -1.723520 0.172349 -10.000 < 2e-16 ***
## relevel(DID, ref = "266")175 -2.293485 0.166458 -13.778 < 2e-16 ***
## relevel(DID, ref = "266")176 -0.389622 0.228612 -1.704 0.088327 .
## relevel(DID, ref = "266")177 -1.775120 0.168013 -10.565 < 2e-16 ***
## relevel(DID, ref = "266")178 -0.088396 0.210848 -0.419 0.675040
## relevel(DID, ref = "266")179 -1.150526 0.179781 -6.400 1.56e-10 ***
## relevel(DID, ref = "266")180 -1.451699 0.173220 -8.381 < 2e-16 ***
## relevel(DID, ref = "266")181 -1.148265 0.182264 -6.300 2.98e-10 ***
## relevel(DID, ref = "266")182 -2.301266 0.165160 -13.934 < 2e-16 ***
## relevel(DID, ref = "266")183 -1.559035 0.172473 -9.039 < 2e-16 ***
## relevel(DID, ref = "266")184 -2.161783 0.165073 -13.096 < 2e-16 ***
## relevel(DID, ref = "266")185 -0.255138 0.189408 -1.347 0.177970
## relevel(DID, ref = "266")186 -2.386320 0.164891 -14.472 < 2e-16 ***
## relevel(DID, ref = "266")187 -2.162162 0.166537 -12.983 < 2e-16 ***
## relevel(DID, ref = "266")188 -1.932412 0.167238 -11.555 < 2e-16 ***
## relevel(DID, ref = "266")189 -1.114197 0.175976 -6.332 2.43e-10 ***
## relevel(DID, ref = "266")190 -1.830149 0.174173 -10.508 < 2e-16 ***
## relevel(DID, ref = "266")191 -1.648106 0.174426 -9.449 < 2e-16 ***
## relevel(DID, ref = "266")192 -1.337355 0.176637 -7.571 3.70e-14 ***
## relevel(DID, ref = "266")193 -0.664444 0.179964 -3.692 0.000222 ***
## relevel(DID, ref = "266")194 -2.631858 0.164251 -16.023 < 2e-16 ***
## relevel(DID, ref = "266")195 -2.789213 0.165049 -16.899 < 2e-16 ***
## relevel(DID, ref = "266")196 -1.025130 0.192010 -5.339 9.35e-08 ***
## relevel(DID, ref = "266")197 -2.305263 0.165346 -13.942 < 2e-16 ***
## relevel(DID, ref = "266")198 -2.639440 0.164133 -16.081 < 2e-16 ***
## relevel(DID, ref = "266")199 0.401975 0.214627 1.873 0.061083 .
## relevel(DID, ref = "266")200 -2.379658 0.167313 -14.223 < 2e-16 ***
## relevel(DID, ref = "266")202 -2.257160 0.166358 -13.568 < 2e-16 ***
## relevel(DID, ref = "266")203 -2.441492 0.163484 -14.934 < 2e-16 ***
## relevel(DID, ref = "266")204 -1.682783 0.169939 -9.902 < 2e-16 ***
## relevel(DID, ref = "266")205 -2.500618 0.163197 -15.323 < 2e-16 ***
## relevel(DID, ref = "266")206 -2.476645 0.163259 -15.170 < 2e-16 ***
## relevel(DID, ref = "266")207 -3.060126 0.163533 -18.713 < 2e-16 ***
## relevel(DID, ref = "266")208 -2.343127 0.164555 -14.239 < 2e-16 ***
## relevel(DID, ref = "266")209 -1.662136 0.175557 -9.468 < 2e-16 ***
## relevel(DID, ref = "266")210 -2.666322 0.162637 -16.394 < 2e-16 ***
## relevel(DID, ref = "266")211 -1.806394 0.173698 -10.400 < 2e-16 ***
## relevel(DID, ref = "266")212 -2.424800 0.165135 -14.684 < 2e-16 ***
## relevel(DID, ref = "266")213 -2.517405 0.163698 -15.378 < 2e-16 ***
## relevel(DID, ref = "266")214 -2.856169 0.163880 -17.428 < 2e-16 ***
## relevel(DID, ref = "266")215 -1.894926 0.163571 -11.585 < 2e-16 ***
## relevel(DID, ref = "266")216 -2.667465 0.165232 -16.144 < 2e-16 ***
## relevel(DID, ref = "266")217 -2.621078 0.164739 -15.910 < 2e-16 ***
## relevel(DID, ref = "266")218 -2.373729 0.162463 -14.611 < 2e-16 ***
## relevel(DID, ref = "266")219 -2.573537 0.165147 -15.583 < 2e-16 ***
## relevel(DID, ref = "266")220 -2.929650 0.161316 -18.161 < 2e-16 ***
## relevel(DID, ref = "266")221 -3.388760 0.164526 -20.597 < 2e-16 ***
## relevel(DID, ref = "266")222 -2.641607 0.162435 -16.263 < 2e-16 ***
## relevel(DID, ref = "266")223 -2.335626 0.171366 -13.629 < 2e-16 ***
## relevel(DID, ref = "266")224 -2.947502 0.163771 -17.998 < 2e-16 ***
## relevel(DID, ref = "266")225 -2.765260 0.164207 -16.840 < 2e-16 ***
## relevel(DID, ref = "266")226 -2.445937 0.166589 -14.682 < 2e-16 ***
## relevel(DID, ref = "266")227 -2.505052 0.163056 -15.363 < 2e-16 ***
## relevel(DID, ref = "266")228 -3.436061 0.165374 -20.778 < 2e-16 ***
## relevel(DID, ref = "266")229 -2.788615 0.166051 -16.794 < 2e-16 ***
## relevel(DID, ref = "266")230 -2.951117 0.163918 -18.004 < 2e-16 ***
## relevel(DID, ref = "266")231 -2.540898 0.163221 -15.567 < 2e-16 ***
## relevel(DID, ref = "266")232 -1.303620 0.172806 -7.544 4.56e-14 ***
## relevel(DID, ref = "266")233 -3.337002 0.166821 -20.004 < 2e-16 ***
## relevel(DID, ref = "266")234 -2.488387 0.169736 -14.660 < 2e-16 ***
## relevel(DID, ref = "266")235 -1.659018 0.168694 -9.834 < 2e-16 ***
## relevel(DID, ref = "266")236 -1.870621 0.169750 -11.020 < 2e-16 ***
## relevel(DID, ref = "266")237 -1.475577 0.173747 -8.493 < 2e-16 ***
## relevel(DID, ref = "266")238 1.232565 0.295481 4.171 3.03e-05 ***
## relevel(DID, ref = "266")239 -1.145610 0.177968 -6.437 1.22e-10 ***
## relevel(DID, ref = "266")240 -1.659761 0.170571 -9.731 < 2e-16 ***
## relevel(DID, ref = "266")241 -2.841397 0.164413 -17.282 < 2e-16 ***
## relevel(DID, ref = "266")242 -1.867780 0.170169 -10.976 < 2e-16 ***
## relevel(DID, ref = "266")243 0.320346 0.303443 1.056 0.291104
## relevel(DID, ref = "266")244 -1.201486 0.181179 -6.632 3.32e-11 ***
## relevel(DID, ref = "266")245 -0.297386 0.202708 -1.467 0.142358
## relevel(DID, ref = "266")246 -2.358940 0.166469 -14.170 < 2e-16 ***
## relevel(DID, ref = "266")247 -2.172820 0.171802 -12.647 < 2e-16 ***
## relevel(DID, ref = "266")248 -1.678299 0.165842 -10.120 < 2e-16 ***
## relevel(DID, ref = "266")249 -0.647963 0.196336 -3.300 0.000966 ***
## relevel(DID, ref = "266")250 -1.467809 0.170954 -8.586 < 2e-16 ***
## relevel(DID, ref = "266")251 -1.356769 0.175621 -7.726 1.11e-14 ***
## relevel(DID, ref = "266")252 -1.572819 0.170335 -9.234 < 2e-16 ***
## relevel(DID, ref = "266")253 -1.325726 0.178398 -7.431 1.08e-13 ***
## relevel(DID, ref = "266")254 -2.523487 0.180001 -14.019 < 2e-16 ***
## relevel(DID, ref = "266")255 -1.601880 0.175767 -9.114 < 2e-16 ***
## relevel(DID, ref = "266")256 -0.530739 0.181964 -2.917 0.003537 **
## relevel(DID, ref = "266")257 -0.491253 0.215678 -2.278 0.022743 *
## relevel(DID, ref = "266")258 -2.397226 0.164104 -14.608 < 2e-16 ***
## relevel(DID, ref = "266")259 -0.120495 0.218934 -0.550 0.582065
## relevel(DID, ref = "266")260 -0.579001 0.183002 -3.164 0.001557 **
## relevel(DID, ref = "266")261 -3.072381 0.162677 -18.886 < 2e-16 ***
## relevel(DID, ref = "266")262 -1.110709 0.174535 -6.364 1.97e-10 ***
## relevel(DID, ref = "266")263 -0.618894 0.185937 -3.329 0.000873 ***
## relevel(DID, ref = "266")264 -0.817633 0.179256 -4.561 5.09e-06 ***
## relevel(DID, ref = "266")265 -0.190128 0.204767 -0.929 0.353142
## relevel(DID, ref = "266")267 -0.203894 0.200998 -1.014 0.310388
## relevel(DID, ref = "266")268 -1.662288 0.179179 -9.277 < 2e-16 ***
## relevel(DID, ref = "266")269 0.958226 0.302515 3.168 0.001537 **
## relevel(DID, ref = "266")270 -0.206918 0.198848 -1.041 0.298067
## relevel(DID, ref = "266")271 0.511318 0.257880 1.983 0.047393 *
## relevel(DID, ref = "266")272 0.933463 0.295507 3.159 0.001584 **
## relevel(DID, ref = "266")273 -0.224661 0.210888 -1.065 0.286736
## relevel(DID, ref = "266")274 0.181126 0.269851 0.671 0.502089
## relevel(DID, ref = "266")275 -1.877979 0.170434 -11.019 < 2e-16 ***
## relevel(DID, ref = "266")276 1.254633 0.295322 4.248 2.15e-05 ***
## relevel(DID, ref = "266")277 -1.036492 0.226583 -4.574 4.77e-06 ***
## relevel(DID, ref = "266")278 -1.035538 0.167092 -6.197 5.74e-10 ***
## relevel(DID, ref = "266")279 -2.582915 0.164997 -15.654 < 2e-16 ***
## relevel(DID, ref = "266")280 -1.305004 0.172797 -7.552 4.28e-14 ***
## relevel(DID, ref = "266")281 -0.702178 0.180060 -3.900 9.63e-05 ***
## relevel(DID, ref = "266")282 -1.272273 0.181308 -7.017 2.26e-12 ***
## relevel(DID, ref = "266")284 -1.538773 0.170887 -9.005 < 2e-16 ***
## relevel(DID, ref = "266")287 -1.485674 0.175398 -8.470 < 2e-16 ***
## relevel(DID, ref = "266")289 -1.862308 0.166027 -11.217 < 2e-16 ***
## relevel(DID, ref = "266")290 -1.680815 0.170166 -9.878 < 2e-16 ***
## relevel(DID, ref = "266")315 -1.335461 0.190690 -7.003 2.50e-12 ***
## relevel(DID, ref = "266")316 -2.145252 0.167764 -12.787 < 2e-16 ***
## relevel(DID, ref = "266")318 -2.754422 0.168236 -16.372 < 2e-16 ***
## relevel(DID, ref = "266")319 -3.219120 0.163022 -19.747 < 2e-16 ***
## relevel(DID, ref = "266")320 -2.641120 0.168331 -15.690 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 212206 on 208249 degrees of freedom
## Residual deviance: 183508 on 208104 degrees of freedom
## AIC: 183800
##
## Number of Fisher Scoring iterations: 7
glm_child %>%
summary() %>%
.$coefficient %>%
as.data.frame() %>%
select(`Pr(>|z|)`) %>%
filter(`Pr(>|z|)` > 0.05) %>%
row.names() %>%
str_split("\\)") %>%
as.data.frame() %>%
gather() %>%
select(value) %>%
.[seq(2,24,2),] %>%
as.numeric()
## [1] 176 178 185 199 243 245 259 265 267 270 273 274
glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5, DID %in% dists_near_hunza_2))
glm_child %>% summary()
##
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_both_01,
## family = "binomial", data = child_ica_dummy %>% filter(C001 >=
## 5, DID %in% dists_near_hunza_2))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8439 0.2201 0.2575 0.2913 0.4617
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.61568 0.12770 28.314 < 2e-16 ***
## n_children_in_household -0.15896 0.03082 -5.158 2.49e-07 ***
## PR004_PR009_both_01 0.56938 0.08634 6.595 4.25e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5235.5 on 16790 degrees of freedom
## Residual deviance: 5161.7 on 16788 degrees of freedom
## AIC: 5167.7
##
## Number of Fisher Scoring iterations: 6
hoslem.test(x = glm_child$y, y = fitted(glm_child))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: glm_child$y, fitted(glm_child)
## X-squared = 12.573, df = 8, p-value = 0.1274
glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01 + DID_near_hunza_2, family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5))
glm_child %>% summary()
##
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_both_01 +
## DID_near_hunza_2, family = "binomial", data = child_ica_dummy %>%
## filter(C001 >= 5))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9326 0.3055 0.5206 0.7875 1.1187
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.448166 0.015604 92.81 <2e-16 ***
## n_children_in_household -0.109043 0.003552 -30.70 <2e-16 ***
## PR004_PR009_both_01 1.136312 0.015705 72.36 <2e-16 ***
## DID_near_hunza_2 1.811042 0.041876 43.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 212206 on 208249 degrees of freedom
## Residual deviance: 199881 on 208246 degrees of freedom
## AIC: 199889
##
## Number of Fisher Scoring iterations: 5
hoslem.test(x = glm_child$y, y = fitted(glm_child))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: glm_child$y, fitted(glm_child)
## X-squared = 33.224, df = 8, p-value = 5.611e-05
glm_child <- glm(C003_01 ~ n_children_in_household + C002_01 + PR004_PR009_both_01 + as.factor(H004), family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5, !DID %in% dists_near_hunza_2))
glm_child %>% summary()
##
## Call:
## glm(formula = C003_01 ~ n_children_in_household + C002_01 + PR004_PR009_both_01 +
## as.factor(H004), family = "binomial", data = child_ica_dummy %>%
## filter(C001 >= 5, !DID %in% dists_near_hunza_2))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4357 0.3420 0.5790 0.7647 1.4179
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.908080 0.017456 109.31 <2e-16 ***
## n_children_in_household -0.103530 0.003677 -28.16 <2e-16 ***
## C002_01 -0.779046 0.011555 -67.42 <2e-16 ***
## PR004_PR009_both_01 1.108833 0.016443 67.44 <2e-16 ***
## as.factor(H004)0 -0.643349 0.014718 -43.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 200070 on 189127 degrees of freedom
## Residual deviance: 185935 on 189123 degrees of freedom
## (2331 observations deleted due to missingness)
## AIC: 185945
##
## Number of Fisher Scoring iterations: 4
hoslem.test(x = glm_child$y, y = fitted(glm_child))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: glm_child$y, fitted(glm_child)
## X-squared = 156.66, df = 8, p-value < 2.2e-16
vif(glm_child)
## n_children_in_household C002_01 PR004_PR009_both_01
## 1.008000 1.008143 1.033210
## as.factor(H004)
## 1.024592
g_n <- gschool %>%
group_by(DID) %>%
summarize(g_n = sum(unique(GSID)))
## `summarise()` ungrouping output (override with `.groups` argument)
p_n <- pschool %>%
group_by(DID) %>%
summarize(p_n = sum(unique(PSID)))
## `summarise()` ungrouping output (override with `.groups` argument)
school_type <- g_n %>% full_join(p_n) %>%
mutate(ratio = p_n/(g_n + p_n))
## Joining, by = "DID"
school_type$DID <- as.factor(school_type$DID)
child_ica_dummy %>%
full_join(school_type)
## Joining, by = "DID"
glm_child <- glm(C003_01 ~ ratio + C002_01 + PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% full_join(school_type) %>% filter(C001 >= 5, !DID %in% dists_near_hunza_2))
## Joining, by = "DID"
## Joining, by = "DID"
ggPredict(glm_child, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
##
## Call:
## glm(formula = C003_01 ~ ratio + C002_01 + PR004_PR009_both_01,
## family = "binomial", data = child_ica_dummy %>% full_join(school_type) %>%
## filter(C001 >= 5, !DID %in% dists_near_hunza_2))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6413 0.3124 0.5318 0.7304 1.0411
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.06540 0.01338 79.65 <2e-16 ***
## ratio 2.04959 0.04468 45.87 <2e-16 ***
## C002_01 -0.75283 0.01401 -53.75 <2e-16 ***
## PR004_PR009_both_01 1.11872 0.01922 58.20 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 140074 on 143360 degrees of freedom
## Residual deviance: 129578 on 143357 degrees of freedom
## (48098 observations deleted due to missingness)
## AIC: 129586
##
## Number of Fisher Scoring iterations: 5
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
# glm_child %>% summary()
hoslem.test(x = glm_child$y, y = fitted(glm_child))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: glm_child$y, fitted(glm_child)
## X-squared = 602.05, df = 8, p-value < 2.2e-16
glm_child <- glm(C003_01 ~ n_children_in_household + ratio + C002_01 + PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% full_join(school_type) %>% filter(C001 >= 5, !DID %in% dists_near_hunza_2))
## Joining, by = "DID"
## Joining, by = "DID"
ggPredict(glm_child, se = TRUE, colorAsFactor = TRUE, show.summary = TRUE, point = TRUE)
##
## Call:
## glm(formula = C003_01 ~ n_children_in_household + ratio + C002_01 +
## PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>%
## full_join(school_type) %>% filter(C001 >= 5, !DID %in% dists_near_hunza_2))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7354 0.3087 0.5204 0.7147 1.3120
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.479115 0.022225 66.55 <2e-16 ***
## n_children_in_household -0.103089 0.004352 -23.69 <2e-16 ***
## ratio 2.033637 0.044722 45.47 <2e-16 ***
## C002_01 -0.743131 0.014044 -52.91 <2e-16 ***
## PR004_PR009_both_01 1.077862 0.019318 55.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 140074 on 143360 degrees of freedom
## Residual deviance: 129023 on 143356 degrees of freedom
## (48098 observations deleted due to missingness)
## AIC: 129033
##
## Number of Fisher Scoring iterations: 5
## Warning in ggPredict(glm_child, se = TRUE, colorAsFactor = TRUE, show.summary =
## TRUE, : maximum three independent variables are allowed
## NULL
# glm_child %>% summary()
hoslem.test(x = glm_child$y, y = fitted(glm_child))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: glm_child$y, fitted(glm_child)
## X-squared = 254.37, df = 8, p-value < 2.2e-16
each_dist <- sapply(as.numeric(as.character(unique(child_ica_dummy$DID))), function(id){
child_ica_dummy <- child_ica_dummy %>% filter(DID == id)
glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01,
family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5))
temp_hoslem <- hoslem.test(x = glm_child$y, y = fitted(glm_child))
data.frame(DID = id, hoslem_p_value = temp_hoslem$p.value)
}) %>% t()
each_dist
## DID hoslem_p_value
## [1,] 146 0.4833365
## [2,] 147 0.9696368
## [3,] 148 0.2340054
## [4,] 149 0.263045
## [5,] 150 0.4052944
## [6,] 151 0.6107987
## [7,] 152 0.5295676
## [8,] 153 0.2108079
## [9,] 154 0.8521452
## [10,] 155 0.9999996
## [11,] 156 0.4243112
## [12,] 157 0.1996798
## [13,] 158 0.269862
## [14,] 159 0.957696
## [15,] 160 0.1873412
## [16,] 161 0.09862571
## [17,] 162 0.970035
## [18,] 163 0.07888207
## [19,] 164 0.8745456
## [20,] 165 0.009088681
## [21,] 166 0.6417484
## [22,] 167 0.8832939
## [23,] 169 0.9760668
## [24,] 170 0.3695271
## [25,] 171 0.1588784
## [26,] 172 0.07933903
## [27,] 173 0.009982887
## [28,] 174 0.1314211
## [29,] 175 0.9932377
## [30,] 176 0.9237029
## [31,] 177 0.5996293
## [32,] 178 0.6349025
## [33,] 179 0.2552938
## [34,] 180 0.5420001
## [35,] 181 0.7728695
## [36,] 182 0.9210615
## [37,] 183 0.00772995
## [38,] 184 0.6966427
## [39,] 185 0.1115057
## [40,] 186 0.6361407
## [41,] 187 NaN
## [42,] 188 0.6543812
## [43,] 189 0.5089121
## [44,] 190 0.4263431
## [45,] 191 0.9611336
## [46,] 192 0.2009149
## [47,] 193 6.445687e-06
## [48,] 194 0.615997
## [49,] 195 0.2881515
## [50,] 196 0.003757288
## [51,] 197 0.959232
## [52,] 198 0.3930063
## [53,] 199 0.3020499
## [54,] 200 0.2915802
## [55,] 202 0.8220956
## [56,] 203 3.982822e-07
## [57,] 204 0.7540585
## [58,] 315 0.4577361
## [59,] 316 0.01377099
## [60,] 320 0.03400093
## [61,] 205 0.9858706
## [62,] 206 0.9781201
## [63,] 207 0.9324696
## [64,] 208 0.9349555
## [65,] 209 0.8148914
## [66,] 210 0.1928247
## [67,] 211 0.9737445
## [68,] 212 0.9604219
## [69,] 213 0.8909924
## [70,] 214 0.8231951
## [71,] 215 0.6468684
## [72,] 216 0.927478
## [73,] 217 0.8091547
## [74,] 218 0.6925247
## [75,] 219 0.851332
## [76,] 220 0.1065343
## [77,] 221 0.9281304
## [78,] 222 0.2759677
## [79,] 223 0.8395059
## [80,] 224 0.8469389
## [81,] 225 0.9953924
## [82,] 226 0.1391444
## [83,] 227 0.4715612
## [84,] 228 0.951791
## [85,] 229 1
## [86,] 230 0.9506553
## [87,] 231 0.999271
## [88,] 232 0.002364582
## [89,] 233 0.8167503
## [90,] 234 0.2029032
## [91,] 318 0.8704222
## [92,] 319 0.2196002
## [93,] 235 0.2113758
## [94,] 236 0.02945804
## [95,] 237 0.2517542
## [96,] 238 0.6276721
## [97,] 239 0.2157016
## [98,] 240 0.1282194
## [99,] 241 5.197638e-05
## [100,] 242 0.6327375
## [101,] 243 0.9503456
## [102,] 244 0.2637431
## [103,] 245 0.9922232
## [104,] 246 0.005520648
## [105,] 247 0.2720873
## [106,] 248 0.008590541
## [107,] 249 0.4451278
## [108,] 250 0.3860093
## [109,] 251 0.9431389
## [110,] 252 0.363282
## [111,] 253 0.9562219
## [112,] 254 0.5528683
## [113,] 255 0.2014395
## [114,] 256 0.655442
## [115,] 257 0.006877678
## [116,] 258 0.0006264664
## [117,] 259 0.7785989
## [118,] 260 0.07112161
## [119,] 261 0.8481883
## [120,] 262 0.4452464
## [121,] 263 0.9315994
## [122,] 264 0.8451285
## [123,] 265 0.9716815
## [124,] 266 0.7129842
## [125,] 267 0.1644173
## [126,] 268 0.03273428
## [127,] 269 0.9902921
## [128,] 270 0.9496613
## [129,] 271 0.9579085
## [130,] 272 0.8863188
## [131,] 273 0.4208476
## [132,] 274 0.7171719
## [133,] 275 0.01355154
## [134,] 276 0.9920801
## [135,] 277 0.06343009
## [136,] 278 0.01073304
## [137,] 279 0.6630176
## [138,] 280 0.4948066
## [139,] 281 0.9708145
## [140,] 282 0.8897523
## [141,] 284 0.7440368
## [142,] 287 0.3178372
## [143,] 289 0.05144863
## [144,] 290 0.1967437
each_dist <- each_dist %>%
as.data.frame()
each_dist$DID <- each_dist %>%
pull(DID) %>%
as.numeric()
each_dist$hoslem_p_value <- each_dist %>%
pull(hoslem_p_value) %>%
as.numeric()
each_dist
each_dist %>%
filter(hoslem_p_value <= 0.05)
dists_not_fit <- each_dist %>%
filter(hoslem_p_value <= 0.05) %>%
pull(DID)
glm_child <- glm(C003_01 ~ n_children_in_household + PR004_PR009_both_01, family = "binomial", data = child_ica_dummy %>% filter(C001 >= 5, !DID %in% dists_not_fit))
glm_child %>% summary()
##
## Call:
## glm(formula = C003_01 ~ n_children_in_household + PR004_PR009_both_01,
## family = "binomial", data = child_ica_dummy %>% filter(C001 >=
## 5, !DID %in% dists_not_fit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3662 0.3771 0.6920 0.7748 1.1695
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.564935 0.016627 94.12 <2e-16 ***
## n_children_in_household -0.128849 0.003823 -33.70 <2e-16 ***
## PR004_PR009_both_01 1.300495 0.017287 75.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 185614 on 181260 degrees of freedom
## Residual deviance: 176655 on 181258 degrees of freedom
## AIC: 176661
##
## Number of Fisher Scoring iterations: 5
hoslem.test(x = glm_child$y, y = fitted(glm_child))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: glm_child$y, fitted(glm_child)
## X-squared = 44.192, df = 8, p-value = 5.235e-07